"""
Author:
    Damon
Purpose:
    Scrape the Douban Top 250 movie list and save each movie's
    details to a local text file.
"""

import sys

import requests
from bs4 import BeautifulSoup
from requests.exceptions import RequestException


# URL template for the Top 250 list pages (25 movies per page).
URL = "https://movie.douban.com/top250?start={}"
# Browser-like headers, shared by all requests: Douban rejects the default requests User-Agent.
HEADERS = {
    'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
                  "(KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36"
}
# Detail-page URL of every movie, in ranking order.
entity_url = []


def save_data(result):
    """
    Append one movie's details to the local text file.
    :param result: dict of scraped fields for a single movie
    :return: None
    """
    with open('movies.txt', "a", encoding="utf-8") as f:
        f.write("========================================================================================================\n")
        f.write("Rank: " + result['top'] + "\n")
        f.write("Rating: " + result['grade'] + "\n")
        f.write("Title: " + result['name'] + "\n")
        f.write("Director: " + result['director'] + "\n")
        f.write("Writer: " + result['scriptwriter'] + "\n")
        f.write("Cast: " + result['protagonist'] + "\n")
        f.write("Synopsis: " + result['synopsis'] + "\n")
        f.write("Reviews:\n")
        for user, discuss in result['film_review']:
            f.write("\t" + user + ": " + discuss + "\n")
        f.write("URL: " + result['url'] + "\n\n")

    print("Processed: " + result['name'] + "\t" + result['top'])


def analysis_page(num, url):
    """
    Parse a movie detail page and extract the fields we want.
    :param num: Top 250 rank
    :param url: movie detail-page URL
    :return: -1 if the page no longer exists, otherwise None
    """
    # All fields for one movie
    result = {}
    try:
        res = requests.get(url, headers=HEADERS, timeout=10)
        res.raise_for_status()
        res.encoding = "utf-8"
    except RequestException as e:
        print("Detail-page request failed:", repr(e))
        print("URL:", url)
        # [Pending] No recovery for a failed request yet; for now, exit with an error code.
        sys.exit(1)

    soup = BeautifulSoup(res.text, "html.parser")

    # If the page no longer exists, log the gap and move on to the next movie.
    title = soup.select("title")[0].text
    if title == "页面不存在":  # Douban's "page not found" title
        with open('movies.txt', "a", encoding="utf-8") as f:
            f.write("========================================================================================================\n")
            f.write("Rank: Top" + str(num) + "\n")
            f.write("ERROR: page not found\n")
            f.write("URL: " + url + "\n\n")
        return -1

    try:
        # Rank
        result['top'] = "Top" + str(num)
        # Rating
        result['grade'] = soup.select("#interest_sectl > div.rating_wrap.clearbox > div.rating_self.clearfix > strong")[0].text
        # Title
        result['name'] = soup.select("#content > h1")[0].text.replace("\n", "")
        # Director
        result['director'] = soup.select("#info > span > span.attrs")[0].text
        try:
            # Writer
            result['scriptwriter'] = soup.select("#info > span > span.attrs")[1].text
            # Cast
            result['protagonist'] = soup.select("#info > span.actor > span.attrs")[0].text
        except IndexError:
            # Some pages list no writer or cast.
            result['scriptwriter'] = ""
            result['protagonist'] = ""
        try:
            # Synopsis: long texts are collapsed inside span.short.
            result['synopsis'] = soup.select("#link-report > span.short > span")[0].text.replace("\n", "").replace(" ", "")
        except IndexError:
            # Short synopses are not wrapped in span.short.
            result['synopsis'] = soup.select("#link-report > span")[0].text.replace("\n", "").replace(" ", "")
        # Hot comments: pair up to five reviewer names with their comment text.
        users = soup.select("#hot-comments > div > div > h3 > span.comment-info > a")
        comments = soup.select("#hot-comments > div > div > p")
        result['film_review'] = [(user.text, comment.text) for user, comment in zip(users[:5], comments[:5])]
        # Detail-page URL
        result['url'] = url
    except Exception as e:
        print("Failed to parse:", url, repr(e))
        # [Pending] No recovery for an unexpected page layout yet; for now, exit with an error code.
        sys.exit(1)

    # Save the record to the local txt file.
    save_data(result)


def get_entity_url(url):
    """
    Collect the detail-page URL of every movie on one list page.
    :param url: list-page URL
    :return: None
    """
    try:
        res = requests.get(url, headers=HEADERS, timeout=10)
        res.raise_for_status()
        res.encoding = "utf-8"
    except RequestException as e:
        print("List-page request failed:", repr(e))
        print("URL:", url)
        # [Pending] No recovery for a failed request yet; for now, exit with an error code.
        sys.exit(1)

    soup = BeautifulSoup(res.text, "html.parser")

    # One anchor per movie on the list page.
    entity = soup.select("#content > div > div.article > ol > li > div > div.info > div.hd > a")
    for item in entity:
        entity_url.append(item['href'])


def make_url(num):
    """
    Build the URL of one list page and collect its movie links.
    :param num: zero-based page index (each page holds 25 movies)
    :return: None
    """
    url = URL.format(num * 25)
    get_entity_url(url)
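
# A possible refinement for the "[Pending]" notes above, sketched but not wired
# in: retry transient request failures a few times before giving up, instead of
# exiting on the first error. The name `fetch_with_retry` and the retry count /
# back-off delay are illustrative assumptions, not part of the original script.
def fetch_with_retry(url, retries=3, delay=2):
    """Return the response for url, retrying transient failures (sketch only)."""
    import time  # local import: this helper is a sketch, not used by the script
    for attempt in range(1, retries + 1):
        try:
            res = requests.get(url, headers=HEADERS, timeout=10)
            res.raise_for_status()
            res.encoding = "utf-8"
            return res
        except RequestException as e:
            print("Attempt %d/%d failed: %s" % (attempt, retries, repr(e)))
            if attempt == retries:
                raise  # let the caller decide how to handle a hard failure
            time.sleep(delay)  # brief pause before retrying
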

if __name__ == '__main__':
    # Collect the URLs of all 250 movies (10 pages of 25).
    for i in range(10):
        make_url(i)
    print("Collected all movie URLs!")
    # Parse each detail page and save the data locally. A return value of -1
    # means the page no longer exists; it has already been logged, so the
    # loop simply moves on to the next movie.
    for i in range(len(entity_url)):
        analysis_page(i + 1, entity_url[i])